clear
set more off


/**************************************************************/
/********** Collect duplicates and non-duplicates *************/
use "${raw}/nlrb/nlrb7799.dta",clear 
g nlrb_id=substr(idnlrb,1,11)
ds idnlrb,not

replace sic3 = "390" if sic3 == "39"
replace sic3 = "390" if sic3 == "39-"

duplicates drop `r(varlist)',force // 233 dropped

// County recodes
replace cntycode = "081" if nlrb_id == "29-RC-09084" & unit == "Z" & cntycode == ""


duplicates tag nlrb_id ,g(dups)


// Drop non duplicates
preserve
	keep if dups==0 // 101,794 obs
	drop dups
	save "${temp}/nlrb7799_1.dta",replace
restore

// 3160 observations with duplicates
drop if dups==0
drop dups

// not duplicate if 
duplicates tag nlrb_id unit,g(dups)
bys nlrb_id: egen check1=total(dups)

duplicates tag nlrb_id eligible vnounion,g(dups2)

order dups2 check
sort check nlrb_id unit

// Non-duplicates when distinct units and vote counts are different
preserve
	keep if check1==0 & dups2 == 0 // 1218 observations
	drop check1 dups2 dups 
	append using  "${temp}/nlrb7799_1.dta"
	save "${temp}/nlrb7799_1.dta",replace

restore

drop if check1==0 & dups2 == 0
drop check1 dups2 dups

g duplicate_obs = 1 // 1942

// Save a dataset which is the full data just identifying duplicates

append using "${temp}/nlrb7799_1.dta"
save "${wd}/nlrb/nlrb7799_1.dta",replace


/**************************************************************/
/************* Split Data into 3 Distinct Cases ***************/

keep if duplicate_obs == 1
cap drop dups

duplicates tag nlrb_id unit,g(dups)
bys nlrb_id: egen checker=total(dups)

// CASE 1 - Duplicates due to multiple units
// And one observation per unit
preserve
	keep if checker==0 // 94 kept
	drop dups checker
	save "${temp}/nlrb7799_2.dta",replace
restore

drop if checker==0
drop dups checker

bys nlrb_id unit: g num1=_N
bys nlrb_id: g num2=_N

* CASE 2 - Single unit if same number of observations at the unit and id level
// Duplicate observation at the unit level
preserve
	keep if num1==num2 // 1686
	drop num*
	save "${temp}/nlrb7799_3.dta",replace
restore

drop if num1==num2 // 162
drop num*

* CASE 3 - Duplicates at the unit_id level and more than one unit
save "${temp}/nlrb7799_4.dta",replace



/***************************************************************/
/************* CASE 3 DATA EXPLORATION AND FIXES ***************/

use "${temp}/nlrb7799_4.dta",clear

// Issue is multiple observations by unit
// Unit only identifies the type of workers - it does not seem to classify distinct voting units
// Meaning it does not seem to identify if there are two voting units that fall under the same broad
// occupation categories

// Generally with a few changes these can be reasonably ruled out as duplicates given
// the difference in eligibility counts
// The thing we cannot rule out however is that one observation represents an aggregate across all
// elections for that unit, and the other captures a count for one specific unit
// I think we have to assume that they represent counts for two distinct units
// But unfortunate possibility of double counting


// Suspect duplicates - differ only on irrelevant variable (or where one missing and one0)
drop if union_incumb=="" & nlrb_id=="02-RC-20011" & unit == "A"

// drop nonconclusive election and retain conclusive one
drop if dateclose=="81/07/" & nlrb_id=="24-RC-06596" 

// Suspect duplicates by SIC
drop if nlrb_id=="02-RC-21365" & sic3 == "836"
drop if nlrb_id=="32-RC-01322" & sic3 == "808"

// Duplicate by employer
drop if nlrb_id=="21-RC-16971" & employer == "NESTIER INC"

// duplicate by date closed
drop if  nlrb_id=="21-RD-01546" & dateclose == "79/02/" & unit == "Z"

// duplicate by union code - suspect typo
drop if  nlrb_id=="23-RC-05248" & union_l1_code == "2473" & unit == "A"
drop if  nlrb_id=="23-RC-05248" & union_l1_code == "2473" & unit == "W"

// duplicates
drop if  nlrb_id=="27-RC-05938" &  duplic == "2"

// duplicates by employer
drop if  nlrb_id=="32-RC-02600" &  employer == "TELEVENTS - W.T.C.I."

save "${temp}/nlrb7799_4_clean.dta",replace



/***************************************************************/
/************* CASE 1 DATA EXPLORATION AND FIXES ***************/
use "${temp}/nlrb7799_2.dta",clear

// Appears to be multi-union elections
// Vote count is the same but 

// Assume duplicate when same industry and location and vote counts
duplicates drop nlrb_id eligible vnounion cntycode stcode sic3 dispos vote_win,force

// From another data source some of these closing dates are not quite correct
bys nlrb_id: g ordering=_n
drop if ordering==2 & casenum=="08RC14544"
drop if ordering==2 & casenum=="01RC18440"
drop ordering

duplicates tag nlrb_id eligible vnounion,g(dups)

preserve
	keep if dups == 0 // 24
	save "${temp}/nlrb7799_2_clean.dta",replace 
restore

// For remaining cases merge back in the other data set
keep if dups>0  // 46 left

preserve 
	use "${raw}/nlrb/nlrb_representation_1961_1998.dta" ,clear
	drop nlrb_id
	
	duplicates drop _all,force

	g strtype="RC" if r_type==1
	replace strtype="RD" if r_type==2
	replace strtype="RM" if r_type==3

	tostring region, g(regionstr)
	replace regionstr="0"+regionstr if length(regionstr)==1

	tostring docket, g(strdocket)
	replace strdocket="00"+strdocket if length(strdocket)==3
	replace strdocket="0"+strdocket if length(strdocket)==4

	g  nlrb_id=regionstr+"-"+strtype+"-"+strdocket
	sort nlrb_id
	
	save "${temp}/nlrb_representation_1961_1998.dta",replace
	
	keep nlrb_id closed_date election_date unit sic

	
	save "${temp}/nlrb_representation_1961_1998_1.dta",replace

restore


// Keep most recent closing date where possible
replace sic3="390" if sic3=="39-"
destring sic3,g(sic)

count

merge 1:m nlrb_id  sic using "${temp}/nlrb_representation_1961_1998_1.dta",force
drop if _merge==2
drop _merge


// If different elections then retain
drop dups
duplicates tag nlrb_id eligible vnounion election_date,g(dups)

preserve
	keep if dups == 0
	append using "${temp}/nlrb7799_2_clean.dta"
	save "${temp}/nlrb7799_2_clean.dta",replace 
restore

keep if dups>0

// Keep most recent closing date where possible
bys nlrb_id: egen maxdate=max(closed_date)
keep if closed_date==maxdate

count

duplicates drop _all,force

// For remaining cases we are unsure if the election took place 
// comfortable that 09-RC-14074 is two distinct units
// Less sure for others but retain as distinct units
append using "${temp}/nlrb7799_2_clean.dta"
save "${temp}/nlrb7799_2_clean.dta",replace




/***************************************************************/
/************* CASE 2 DATA EXPLORATION AND FIXES ***************/
use "${temp}/nlrb7799_3.dta",clear

duplicates tag nlrb_id unit,g(dups)
tab dups

// drop where number of eligible workers quite different and otherwise things similar
// If number of eligible workers different we assume different units
bys nlrb_id unit: egen min_elig = min(eligible)
bys nlrb_id unit: egen max_elig = max(eligible)
g maxdiff = eligible - min_elig if eligible == max_elig & dups == 1
replace  maxdiff = max_elig - eligible if eligible == min_elig & dups == 1

g maxfraction = maxdiff/max_elig

preserve
	keep if maxfraction > .2 & dups == 1 // 804 cases
	drop dups maxfraction maxdiff max_e* min_e*
	
	save "${temp}/temp.dta",replace
restore


drop if maxfraction > .2 & dups == 1 // 882 left
drop dups maxfraction maxdiff max_e* min_e*
duplicates tag nlrb_id unit,g(dups)

preserve
	// Manual review of these
	keep if dups>2 // 87 observations

	append using "${temp}/temp.dta"
	save "${temp}/temp.dta",replace
restore

drop if dups>2
tab dups
drop dups

// Identify if duplicates are by date closed and date elec or not
duplicates tag nlrb_id unit sic3,g(dups1)
duplicates tag nlrb_id unit sic3 dateclose,g(dups2)
duplicates tag nlrb_id unit sic3 dateelec,g(dups3)

order dups*

preserve
	keep if dups2<dups1 | dups3<dups1 // 189 cases
	
	// Keep one where all counts are the same just date differs
	duplicates drop nlrb_id unit sic3 eligible vote_w* vnounion cntycode city,force
	drop dups*
	duplicates tag nlrb_id unit sic3,g(dups1)
	tab dups1
	g checker = 1 if dups == 0 // 39 cases resolved
	
	
	// if first election is not conclusive keep the conclusive election
	g temp = 1 if dups1>0 & var27 == "5"
	bys nlrb_id unit: egen tot = total(temp)
	drop if tot>0 & var27!="5" & checker == . // 16 drops
	
	drop dups*
	duplicates tag nlrb_id unit sic3,g(dups1)
	replace checker = 1 if dups == 0 & checker == . // 14 cases resolved
	
	
	// Recent closing date
	g year_clos = substr(dateclose,1,2)
	g month_clos = substr(dateclose,4,2)
	
	destring year_clos month_clos,replace
	replace year_clos = year_clos+1900
	
	generate recent_clos = ym(year_clos, month_clos)
	format %tm recent_clos	

	bys nlrb_id unit: egen recent = max(recent_clos)
	keep if recent_clos == recent | recent_clos == . |  checker == 1
	drop recent  recent_clos
	
	drop dups*
	duplicates tag nlrb_id unit sic3,g(dups1)
	replace checker = 1 if dups == 0 & checker == . // 17 cases resolved
	
	
	// keep more recent election
	// it appears that in some cases result was re-run
	g year_elect = substr(dateelec,1,2)
	replace year_elect = "" if year_elect == "7 " | year_elect == "0 " | year_elect == "1Z"
	
	g month_elect = substr(dateelec,4,2)
	replace month_elect = "" if  month_elect == "0 "
	
	destring year_elect month_elect,replace
	replace year_elect = year_elect+1900
	
	generate recent_elect = ym(year_elect, month_elect)
	format %tm recent_elect
	
	bys nlrb_id unit: egen recent = max(recent_elect)
	keep if recent_elect == recent | recent_elect == . |  checker == 1
	drop recent 
	
	drop dups*
	duplicates tag nlrb_id unit sic3,g(dups1)
	replace checker = 1 if dups == 0 & checker == . // 20 cases resolved
	
	// temp
	tab checker
	// 2 cases left, only one of these is possibly a duplicate
	
	append using "${temp}/temp.dta"
	save "${temp}/temp.dta",replace

restore


drop if dups2<dups1 | dups3<dups1 // 606 cases
drop dups*

// Keep if all same sic within a nlrb_id unit
duplicates tag nlrb_id unit sic3,g(dups)
bys nlrb_id unit: g num = _N

preserve
	// Keep if all same sic within a nlrb_id unit
	keep if dups+1 == num // 530 observations
	drop dups
	
	* Typos
	replace state1 = "WI" if state1 == "WIS" & nlrb_id == "30-RC-04237"
	replace address = "" if strpos(address,"0WON")>0
	replace state1 = "TX" if state1 == "TEX" & nlrb_id == "23-RC-05234"
	replace state2 = "MD" if state2 == "" & nlrb_id == "05-RC-13194"
	replace state1 = "CA" if state1 == "CAL" & nlrb_id == "21-RC-17208"

	
	// drop very near duplicates along relevant dimensions
	ds var19 duplic var21-var28 idnlrb fiscal* elect_id, not
	duplicates drop `r(varlist)',force
	
	// If date of election and closing is the same and counts differ then we can only assume non-duplicate
	duplicates tag nlrb_id unit sic3 eligible dateclose dateelec,g(dups)
	g checker = 1 if dups == 0 // 262 cases resolved
	drop dups 
	
	// Some cases where company name entered differently otherwise duplicate
	// Same with other variables
	ds var19 duplic var21-var28 idnlrb fiscal* elect_id employer size_cat ///
		union_incumb city unionname union*code union_partic state2fix, not
	duplicates drop  `r(varlist)',force
	duplicates tag nlrb_id unit sic3 eligible dateclose dateelec,g(dups)
	replace checker = 1 if dups == 0 & checker == . // 262 cases resolved
	drop dups 	
	
	drop if nlrb_id == "19-RC-10590" & datefile == ""
	drop if nlrb_id == "04-RC-17517" & datefile == "91/04/"
	drop if nlrb_id == "24-RC-06774" & state2 == "PR"
	drop if nlrb_id == "32-RD-00527" & state1 == "DC"
	drop if nlrb_id == "32-RC-01875" & state1 == "DC"
	drop if nlrb_id == "32-RC-01866" & state1 == "FL"
	drop if nlrb_id == "17-RC-09731" & state1 == "KAN"
	drop if nlrb_id == "04-RC-15846" & state1 == "PA"
	
	duplicates tag nlrb_id unit sic3 eligible dateclose dateelec,g(dups)
	replace checker = 1 if dups == 0 & checker == . 
	drop dups 	

	
	// 113 cases left
	
	drop if nlrb_id == "08-RC-14247" & duplic == "1"
		
	// If count aligns with challenged counts (var19) then retain the observation with less challenged votes
	// Assumption is that those votes have been resolved
	
	drop if nlrb_id == "04-RC-18906" & var19 != "00"
	drop if nlrb_id == "05-RC-13789" & var19 == "07"
	drop if nlrb_id == "07-RC-16009" & var19 == "16"
	drop if nlrb_id == "08-RC-15683" & var19 == "01"
	drop if nlrb_id == "09-RC-16512" & var19 == "16"
	drop if nlrb_id == "09-RC-16533" & var19 == "01"
	drop if nlrb_id == "09-RM-00899" & var19 == "04"
	drop if nlrb_id == "13-RC-15320" & var19 == "07"
	drop if nlrb_id == "16-RD-00882" & var19 == "01"
	drop if nlrb_id == "19-RC-07976" & var19 == "01"
	drop if nlrb_id == "20-RC-17034" & var19 == "01"
	drop if nlrb_id == "22-RC-08280" & var19 == "08"
	drop if nlrb_id == "27-RC-05686" & var19 == "01"
	drop if nlrb_id == "29-RC-03682" & var19 == "13"

	duplicates tag nlrb_id unit sic3 eligible dateclose dateelec,g(dups)
	replace checker = 1 if dups == 0 & checker == . 
	drop dups 

	// Select one observation when vote is similar but more votes in one line
	// Eligible number is large enough to be unlikely two units same size
	// Concern here is that there are two elections but the eligible count is wrong for one
	// of the two elections and really there are other eligible workers that should be accounted for
	// No way to determine this
	
	
	drop if nlrb_id == "01-RC-17668" & vote_win == 0
	drop if nlrb_id == "04-RC-14908" & vote_win == 39
	drop if nlrb_id == "07-RC-14990" & vnounion == 92
	drop if nlrb_id == "07-RC-15188" & vnounion == 10
	drop if nlrb_id == "08-RC-15686" & cntycode == ""
	drop if nlrb_id == "11-RC-05080" & vnounion == 0
	drop if nlrb_id == "17-RC-08890" & vote_win == 17
	drop if nlrb_id == "18-RC-12212" & vote_win == 4
	drop if nlrb_id == "22-RC-08848" & vote_win == 0
	drop if nlrb_id == "23-RC-04908" & vote_lose1 == 10
	drop if nlrb_id == "26-RC-05777" & vnounion == 61
	drop if nlrb_id == "29-RC-05921" & vnounion == 5
	drop if nlrb_id == "32-RC-02810" & vnounion == 3
	drop if nlrb_id == "33-RC-03022" & vote_lose1 == 0
	drop if nlrb_id == "36-RC-04232" & vote_lose1 == 12
	drop if nlrb_id == "36-RC-04260" & vnounion == 29
	drop if nlrb_id == "25-RC-07076" & vote_win == 10
	drop if nlrb_id == "21-RC-16001" & vote_lose1 == 4


	duplicates tag nlrb_id unit sic3 eligible dateclose dateelec,g(dups)
	replace checker = 1 if dups == 0 & checker == . 
	drop dups 

	// assume no issues with small units
	replace checker = 1 if eligible <=15 & checker == . 

	// remaining cases are very similar but win/loss is different and
	// not clear how to reconcile counts in any meaningful way
	g flag = 1 if checker == .
	drop checker
	
	append using "${temp}/temp.dta"
	save "${temp}/temp.dta",replace

restore

drop if dups+1 == num

// Manual case by case review
// Cases where counts are very similar but SIC codes differ
// Assumed duplicate, all same and sic only marginally different
drop if sic3 == "235" & nlrb_id == "02-RC-19035"

// Missing details in 1961_1998 file and sic less likely to match with company name
drop if sic3 == "284" & nlrb_id == "02-RC-19128"

// Fomr 1961_1998 different closing date
// For quite a few of these it appears that the vote was challenged and then the election was closed
// at a later date. The SIC code has then changed and we assume, corrected.
drop if sic3 == "356" & nlrb_id == "04-RC-15821"
drop if sic3 == "341" & nlrb_id == "04-RC-15825"
drop if sic3 == "353" & nlrb_id == "04-RC-15860"
drop if sic3 == "353" & nlrb_id == "04-RD-01217"
drop if sic3 == "514" & nlrb_id == "05-RC-11663"
replace state1 = "MD" if nlrb_id == "05-RC-11663"

drop if sic3 == "334" & nlrb_id == "05-RC-11941"
drop if sic3 == "251" & nlrb_id == "10-RC-11235"
drop if sic3 == "16" & nlrb_id == "10-RC-11418"
drop if sic3 == "331" & nlrb_id == "10-RD-00925"
drop if sic3 == "342" & nlrb_id == "13-RD-01609"
drop if sic3 == "390" & nlrb_id == "17-RC-09663"
drop if sic3 == "581" & nlrb_id == "19-RC-10235"
drop if sic3 == "821" & nlrb_id == "22-RC-07198"
drop if sic3 == "483" & nlrb_id == "26-RC-06454"
drop if sic3 == "121" & nlrb_id == "28-RC-04039"
drop if sic3 == "508" & nlrb_id == "29-RC-04264"
drop if sic3 == "236" & nlrb_id == "29-RC-05492"
drop if sic3 == "390" & nlrb_id == "31-RC-04957"
drop if sic3 == "806" & nlrb_id == "33-RC-02460"
drop if sic3 == "806" & nlrb_id == "36-RC-04675"

drop if sic3 == "205" & nlrb_id == "02-RC-20731"


drop dups
duplicates tag nlrb_id unit ,g(dups)
tab dups // 28 remaining "duplicates"


append using "${temp}/temp.dta"
save "${temp}/nlrb7799_3_clean.dta",replace




/***************************************************************/
/************* SAVE CLEAN DATA ***************/

use "${temp}/nlrb7799_1.dta",clear
append using "${temp}/nlrb7799_2_clean.dta"
append using "${temp}/nlrb7799_3_clean.dta"
append using "${temp}/nlrb7799_4_clean.dta"
save "${wd}/nlrb/nlrb7799_02.dta",replace

// 104,954 vs 104,316 observations in clean data
// old data is 104,730





